library(readxl)
library(quanteda.textmodels)
library(caret)
library(tidyverse)
library(quanteda)
library(readtext)
# Training and Testing before the whole corpus, skip to load for full data
#PosNames <- c("Solve Case","Initiatives","Cop_Danger","Community","Tech","Oth_Danger","Fam","Int")
#NegNames <- c("Racism","Sex", "Het", "Corr","DV","Vice","NegFam","Mistake","Crude")
#DataNames <- c("Show Name","Season:","Season Ep #:","Ep Title:","Coder Rating","ID","NegTot", 
#               "PosTot")
# Here are the training / testing seasons / episodes
#files <- list.files("C:/Users/esteb/Dropbox (MIT)/Summer 2023/Text Project/CodingEMF2",
#                    full.names = TRUE)

# empty dataframe for filling. "The bucket"
#buck <- data.frame(matrix(nrow=0,ncol = 0))
# loop over, read the important stuff
#for(i in 1:length(files)){
#file <- files[i]
#DF <- read_excel(file, range = "K4:M11", col_names = FALSE)
#DF <- rowSums(DF)
#postot <- sum(DF) 
#file
# read the negative sheet
#Neg <- read_excel(file, sheet = "Negative", range = "I4:K12", col_names = FALSE)
#Neg <- rowSums(Neg)
#NegTot <- sum(Neg)
# this is metadata about the episode / show
#Data <- read_excel(file, sheet = "Data", col_names = FALSE, range="A2:F2")
#Data <- matrix(Data)
# create the row
#Obs <- data.frame(c(DF,Neg,Data, NegTot, postot))
#colnames(Obs) <- colnames(buck)
#buck <- rbind(buck,Obs)
#} 
# 
#colnames(buck) <- c(PosNames,NegNames,DataNames)
# calculate the difference for valence based on categories
#buck$Diff <- buck$PosTot - buck$NegTot
# Create valence for coder assigned ratings
#buck$Valence <- 0
#buck$Valence[which(buck$`Coder Rating`=="Positive")] <- 1
#buck$Valence[which(buck$`Coder Rating`=="positive")] <- 1
#  Valence2, the valence based on categories
#buck$Valence2 <- 0
#buck$Valence2[which(buck$Diff>0)] <- 1

# Bring in the transcripts for these episodes
#trans <- "C:/Users/esteb/Dropbox (MIT)/Summer 2023/Text Project/Transcripts2"
#scrip <-  readtext(trans,docvarnames = c("Show","Season","Episode","ID"), 
#                   docvarsfrom = "filenames",dvsep = "_")

# merge in more data for later. this is season and year
#DF <- merge(buck,scrip,by="ID")
#Index_2 <- read_excel("C:/Users/esteb/Dropbox (MIT)/Summer 2023/Text Project/Index_2.xlsx")
#DF <- merge(DF,Index_2,by="ID")

#DF$text<-  paste(DF$text,DF$positive)

# The merged training/testing frame was written once with
#   save(DF, file = "Testing.rda")
# so restore it from disk here instead of rebuilding it from the coding sheets.
load(file = "Testing.rda")
# create the corpus

### F1  about .857, F1 within positive is great. 
## Definitely good enough to move forward 
###

#trans2 <- "C:/Users/esteb/OneDrive/Desktop/OOS Preds"
#scrip2 <-  readtext(trans2,docvarnames = c("Show","Season"), 
#                   docvarsfrom = "filenames",dvsep = "_")

# now to predict the rest of the shows
#Corp2 <- corpus(scrip2)
# 

### these are saved as R files because I scraped them
## load them in
loadRData <- function(fileName) {
  # Loads an RData/Rda file and returns the single object stored in it.
  #
  # Args:
  #   fileName: path to a file written by save().
  # Returns:
  #   The one (non-hidden) object saved in the file.
  # Errors:
  #   Stops with an explicit message when the file does not hold exactly one
  #   such object.
  #
  # The file is loaded into a scratch environment rather than this function's
  # own frame, so a saved object that happens to be called "fileName" cannot
  # overwrite the argument and is returned like any other object.
  scratch <- new.env(parent = emptyenv())
  load(fileName, envir = scratch)
  # ls() skips dot-prefixed names, so hidden objects are ignored here.
  found <- ls(scratch)
  if (length(found) != 1L) {
    stop(
      "Expected exactly one object in '", fileName, "' but found ",
      length(found), ".",
      call. = FALSE
    )
  }
  get(found, envir = scratch, inherits = FALSE)
}
#setwd("C:/Users/esteb/OneDrive/Documents")
##
### the LARGE bucket to fill
#Bucket <- data.frame(matrix(ncol=40,nrow=200))
#bur <- rep("Ep",37)
#jig <- paste0(bur,1:37)
# loop over all the R files and predict within. then gather and report
#colnames(Bucket) <- c("show","Positive","Total",jig)
#for(i in 1:length(vec)){
#  Z <- loadRData(vec[i])
#  Corp2 <-corpus (Z$final)
#  JustAToke2 <- tokens(Corp2,remove_punct = TRUE) %>% 
#    tokens_remove(stopwords("en")) %>% 
#    tokens_wordstem()
#  DFM2 <- dfm(JustAToke2)
#  DFM3 <- dfm_match(DFM2,DFM)
#  predicted2 <- predict(NB, newdata=DFM2,force=TRUE)
#  Bucket$Positive[i] <- sum(as.numeric(as.character(unlist(predicted2))))
#  Bucket$show[i] <- vec[i]
#  Bucket$Total[i] <- length(predicted2)
#  Bucket[i,4:(3+length(predicted2))] <- predicted2
#                     }
##
#write_xlsx(Bucket, "TextOut.xlsx")
#library(writexl)
# Read the saved prediction summary (columns used below: show, Positive, Total).
TextOut <- read_excel("TextOut.xlsx")
# Share of episodes predicted positive in each row.  The column is added to
# TextOut itself because the by-show mean further down reads TextOut$Percent;
# Bucket is copied afterwards so it carries the same column.
TextOut$Percent <- TextOut$Positive / TextOut$Total
Bucket <- TextOut

## lets calculate some stats

# Pooled share of positive episodes across all rows.
sum(Bucket$Positive, na.rm = TRUE) / sum(Bucket$Total, na.rm = TRUE)
### assign years
##### Average by year, average by show,
# Get season and name separated
TextOut$Season <- parse_number(TextOut$show)
# Rows 1-5 are set by hand to show "911", seasons 1-5: stripping digits would
# otherwise erase that name (presumably the parsed number is off too -- confirm
# these are still the first five rows of the spreadsheet).
TextOut$Season[1:5] <- 1:5
TextOut$show <- gsub("[[:digit:]]+", "", TextOut$show)
TextOut$show[1:5] <- "911"
# fixed = TRUE makes the dot a literal "." instead of a regex wildcard.
TextOut$show <- gsub(".Rda", "", TextOut$show, fixed = TRUE)
table(TextOut$show)
### aggregation by show
purse <- aggregate(TextOut$Positive, list(TextOut$show), sum)
snatch <- aggregate(TextOut$Total, list(TextOut$show), sum)
# Both aggregates group on the same key, so their rows line up.
purse$percent <- purse[, 2] / snatch[, 2]
# prep for visual: mean of the per-row shares within each show
fen <- aggregate(TextOut$Percent, list(TextOut$show), mean)
colnames(fen)[1] <- "Show"
###
# Mean share of positive episodes per year (2000 onward), expressed in percent.
avg_positive_by_year <- TextOut %>%
  filter(Year >= 2000) %>%
  group_by(Year) %>%
  summarise(avg_percent = 100 * mean(Positive / Total, na.rm = TRUE))

# Line-and-point time series with the y axis pinned to the full 0-100 range.
ggplot(data = avg_positive_by_year, mapping = aes(x = Year, y = avg_percent)) +
  geom_line() +
  geom_point() +
  scale_y_continuous(limits = c(0, 100)) +
  ggtitle("Average Positive Episodes by Year (as Percentage)") +
  xlab("Year") +
  ylab("Average Percent of Positive Episodes") +
  theme_minimal()
### 
# Attach each show's label by name, then order the tables for plotting.
library(ggplot2)
names(fen)[1] <- "Show"
# fen is ordered by the mean of per-row shares and purse by the pooled share.
# Those two orderings need not agree, so the summed-positive label (purse$x)
# is looked up by show name rather than copied across by row position.
fen$eps <- purse$x[match(fen$Show, purse$Group.1)]
fen <- fen[order(fen$x), ]
purse <- purse[order(purse$percent), ]
# plot results: one bar per show, labelled with its summed Positive count
ggplot(fen, aes(x = reorder(Show, x), y = x)) +
  geom_col() +
  geom_text(aes(label = eps), vjust = 1.5, color = "white") +
  xlab("Show") +
  ylab("Percent Positive") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
###
#### CUSTOM DICTIONARY RESULTS #####
# Keyword -> theme lookup table (columns: keyword, theme).
# Written as name = keyword, value = theme so each pairing sits on one line;
# row order is the order listed here.
# NOTE(review): the commented-out counting loop stems tokens before matching,
# so unstemmed keywords such as "brutality" may never match -- verify.
custom_dict <- local({
  theme_of <- c(
    racism = "Racism",
    brutality = "Racism",
    protest = "Racism",
    prejudice = "Racism",
    sexism = "Sexism",
    bribe = "Corruption",
    corrupt = "Corruption",
    racist = "Racism",
    bias = "Racism",
    sexist = "Sexism",
    minority = "Racism"
  )
  data.frame(
    keyword = names(theme_of),
    theme = unname(theme_of),
    stringsAsFactors = FALSE
  )
})

## initialize empty list 
#grocerystore <- grocery
#grocery <- list()
#setwd("C:/Users/Esteb/OneDrive/Documents")
#vec<- list.files(pattern=".Rda$")



# create a DFM with all the important words from a script
# function for assigning a theme to words
match_themes <- function(term) {
  # Look up `term` in the global custom_dict keyword column.
  # Returns the matching theme value(s), or NA when no keyword equals `term`.
  hits <- custom_dict[custom_dict$keyword == term, "theme"]
  if (length(hits) == 0) {
    return(NA)
  }
  hits
}
# count them 
#for(i in 1:length(vec)){
#  Z <- loadRData(vec[i])
#  Corp2 <-corpus (Z$final)
#  JustAToke2 <- tokens(Corp2,remove_punct = TRUE) %>% 
#    tokens_remove(stopwords("en")) %>% 
#    tokens_wordstem()
#  DFM2 <- dfm(JustAToke2)
#  ditter <- as.matrix(DFM2)
#  terms <- colnames(ditter)
#  term_themes <- sapply(terms, match_themes)
#  theme_counts <- data.frame(
#    term = terms,
##    theme = unlist(term_themes),
#    freq = colSums(ditter),
#    stringsAsFactors = FALSE
#  )
#  theme_summary <- theme_counts %>%
#    group_by(theme) %>%
#    summarise(total_freq = sum(freq)) %>%
#    arrange(desc(total_freq))
  
#  Racism <- ifelse(length(which(theme_summary$theme == "Racism"))<1,0,
#                   theme_summary$total_freq[which(theme_summary$theme == "Racism")])
  
#  Sexism <- ifelse(length(which(theme_summary$theme == "Sexism"))<1,0,
#                   theme_summary$total_freq[which(theme_summary$theme == "Sexism")])
  
#  Corrupt <- ifelse(length(which(theme_summary$theme == "Corruption"))<1,0,
#                    theme_summary$total_freq[which(theme_summary$theme == "Corruption")])
  
  
  
#  grocery[[i]] <- c(vec[i],Racism,Sexism,Corrupt,nrow(Z))
#}

#CustTextOut <- as.data.frame(do.call(rbind, grocery))
### create final dataframe
# Re-read the season summary so its columns are back in their on-disk form,
# then restore the keyword counts that were written earlier with
#   save(CustTextOut, file = "CustTextOut.rda")
TextOut <- read_excel("TextOut.xlsx")
load(file = "CustTextOut.rda")

colnames(CustTextOut) <- c("Show", "Racism", "Sexism", "Corrupt", "Total")

# assign years
# NOTE(review): years are copied by row position, which assumes CustTextOut and
# TextOut list the same files in the same order -- confirm.
CustTextOut$Year <- TextOut$Year
# Racism count relative to Total for each row.
CustTextOut$Percent <- with(CustTextOut, as.numeric(Racism) / as.numeric(Total))

### remove wrong years (rows with a missing Year are dropped as well)
filtered_data <- CustTextOut[which(CustTextOut$Year >= 2000), ]

# Mean of Percent within each Year
mean_percent_by_year <- aggregate(Percent ~ Year, data = filtered_data, FUN = mean)

# Time series of the yearly means.
# NOTE(review): if Racism is a count of term occurrences and Total a count of
# episodes (as the commented-out counting loop suggests), the y value is
# mentions per episode rather than a percent of episodes -- verify the label.
ggplot(data = mean_percent_by_year, mapping = aes(x = Year, y = Percent)) +
  geom_line() +
  geom_point() +
  labs(
    x = "Year",
    y = "Percent of Episodes w/ Racism Terms",
    title = ""
  ) +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))
####

# Make the year and count columns numeric, derive the racism count relative to
# Total for each row, and keep only rows from 2000 onward.
CustTextOut <- CustTextOut %>%
  mutate(
    Year = as.numeric(Year),
    Racism = as.numeric(Racism),
    Total = as.numeric(Total),
    PercentRacism = Racism / Total
  ) %>%
  filter(Year >= 2000)

# Yearly mean of PercentRacism plus the number of rows behind each mean.
# The summary depends only on CustTextOut, so it is computed a single time.
racism_yearly <- CustTextOut %>%
  group_by(Year) %>%
  summarise(
    AvgPercentRacism = mean(PercentRacism, na.rm = TRUE),
    N = n()
  )

# NOTE(review): the axis is formatted as a percentage of episodes; if Racism
# holds term occurrences rather than episodes-with-a-mention, values can exceed
# 100% -- confirm how CustTextOut.rda was built.
ggplot(racism_yearly, aes(x = Year, y = AvgPercentRacism)) +
  geom_line(color = "firebrick", size = 1.2) +
  geom_point(color = "black", size = 2) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  labs(
    title = "Average % of Episodes with Racism Mentions by Year",
    x = "Year",
    y = "% of Episodes",
    caption = "Based on keyword dictionary tagging"
  ) +
  theme_minimal(base_size = 14)

### CUSTOM DICT FOR COMMENTS
# Load necessary libraries
library(quanteda)
library(dplyr)
library(ggplot2)
# Political / race vocabulary to look for in viewer comments.  Every entry is
# lower-cased; the commented-out scan below lower-cases the comment text too.
# NOTE(review): that scan compares these entries with single DFM feature names,
# so the multi-word phrases ("white supremacy", "critical race theory") look
# like they can never match -- verify.
political_race_words <- tolower(c(
  # movement / ideology labels
  "woke", "blm", "liberal", "conservative", "progressive",
  "socialist", "communist", "fascist",
  # politicians and parties
  "trump", "biden", "republican", "democrat",
  # discrimination and diversity vocabulary
  "racist", "sexist", "systemic", "equity", "diversity", "inclusion",
  "white supremacy", "critical race theory", "CRT",
  # left / right labels
  "leftist", "right-wing"
))

# Empty accumulator for the per-file results.
results_list <- vector("list", 0L)
# loop 
# loop 

#for(i in 1:length(vec)) {
  # Load data for the current episode
#  Z <- loadRData(vec[i])
  
  # Extract filename for later merging
#  filename <- vec[i]  # Store the filename, assuming it contains show/season info
  
  # Convert comments to lowercase for matching
#  comments_text <- tolower(Z$coms)
  
  # Tokenize comments without stemming
#  tokens_coms <- tokens(comments_text, remove_punct = TRUE) %>% 
#    tokens_remove(stopwords("en"))  # No stemming
  
  # Create document-feature matrix
#  DFM_coms <- dfm(tokens_coms)
  
  # Count occurrences of political/race words
#  matching_counts <- sum(colSums(DFM_coms[, featnames(DFM_coms) %in% political_race_words]))
  
  # Store results
#  results_list[[i]] <- data.frame(
#    Filename = filename,  # Keep the filename for later merging
#    PoliticalWordCount = matching_counts
#  )
#}
### Same but percentages
# Start again from an empty accumulator for the percentage version of the scan.
results_list <- vector("list", 0L)

#for (i in 1:length(vec)) {
  # Load data for the current season
#  Z <- loadRData(vec[i])
#  filename <- vec[i]
  
  # Total number of episodes
#  total_episodes <- nrow(Z)
  
  # Convert all comments to lowercase
#  comments_text <- tolower(Z$coms)
  
  # Tokenize (no stemming, no punctuation, no stopwords)
#  tokens_coms <- tokens(comments_text, remove_punct = TRUE) %>%
#    tokens_remove(stopwords("en"))
  
  # Create DFM
#  DFM_coms <- dfm(tokens_coms)
  
  # Check for mentions in each episode (row of DFM)
#  mentions_per_episode <- rowSums(DFM_coms[, featnames(DFM_coms) %in% political_race_words])
  
  # Count episodes with ≥ 1 mention
#  episodes_with_mentions <- sum(mentions_per_episode > 0)
  
  # Compute percentage
#  pct_with_mentions <- episodes_with_mentions / total_episodes
  
  # Store results
#  results_list[[i]] <- data.frame(
#    Filename = filename,
#    EpisodesWithMentions = episodes_with_mentions,
#    TotalEpisodes = total_episodes,
#    PercentWithMentions = pct_with_mentions
#  )
#}
#final_results <- bind_rows(results_list)
#save(final_results,file="final_results.rda")
# Restore the saved per-file comment-mention results.
load(file = "final_results.rda")
# NOTE(review): years are copied by row position, which assumes final_results
# and TextOut list the same files in the same order -- confirm.
final_results$Year <- TextOut$Year
final_results_filtered <- dplyr::filter(final_results, Year >= 2000)


head(final_results)
##
# Pool by year: total episodes with at least one mention over total episodes,
# plus the episode count and number of rows behind each year.  The ratio is
# listed first so it is computed from the per-row TotalEpisodes values before
# that name is reused for the yearly sum.
yearly_summary <- final_results_filtered %>%
  group_by(Year) %>%
  summarise(
    AvgPercentWithMentions =
      sum(EpisodesWithMentions, na.rm = TRUE) / sum(TotalEpisodes, na.rm = TRUE),
    TotalEpisodes = sum(TotalEpisodes, na.rm = TRUE),
    NumSeasons = n()
  )

# Yearly share as a percent-formatted time series; y axis fixed to 0-30%.
ggplot(data = yearly_summary, mapping = aes(x = Year, y = AvgPercentWithMentions)) +
  geom_line(size = 1.2, color = "steelblue") +
  geom_point(size = 2, color = "black") +
  scale_y_continuous(
    limits = c(0, 0.3),
    labels = scales::percent_format(accuracy = 1)
  ) +
  labs(
    x = "Year",
    y = "Percent of Episodes with Mentions",
    title = "% of Episodes with Political/Race Related Comments by Year"
  ) +
  theme_minimal(base_size = 14)


